### Load standardpackages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 ect.
library(magrittr) # For extra-piping operators (eg. %<>%)
library(tidytext)
This session
Refresher:

Bag of words model
- In order for a computer to understand text we need to somehow find a useful representation.
- If you need to compare different texts e.g. articles, you will probably go for keywords. These keywords may come from a keyword-list with for example 200 different keywords
- In that case you could represent each document with a (sparse) vector with 1 for “keyword present” and 0 for “keyword absent”
- We can also get a bit more sophisticated and count the number of times a word from our dictionary occurs.
- For a corpus of documents that would give us a document-term matrix.
Let’s try creating a bag of words model from our initial example.
text <- tibble(id = c(1:6),
text = c('A text about cats.',
'A text about dogs.',
'And another text about a dog.',
'Why always writing about cats and dogs, always dogs?',
'There are too little text about cats but to many about dogs',
'Cats, cats, cats! I love cats soo much. Cats are way better than dogs'))
text_tidy <- text %>%
unnest_tokens(word, text, token = 'words') %>%
count(id, word)
The document-term matrix (DTM)
- The simplest form of vector representation of text is a document-term matrix
- How do we get a document-term matrix now?
- We could do it by hand, with well-known
dplyr syntax (Note: only works when you have one row per unique document-word pair)
text_tidy %>%
pivot_wider(names_from = word, values_from = n, values_fill = 0)
- We could also use
cast_dtm() to create a DTM in the format of the tm package.
text_dtm <- text_tidy %>%
cast_dtm(id, word, n)
text_dtm
<<DocumentTermMatrix (documents: 6, terms: 25)>>
Non-/sparse entries: 42/108
Sparsity : 72%
Maximal term length: 7
Weighting : term frequency (tf)
- We can simply convert it to a tibble. Since there exists no direct transfer function, we have to first transform it to a matrix.
- Notice how we recover the rownames
text_dtm %>% as.matrix() %>% as_tibble(rownames = 'id')
- Sidenote: We can also tidy the DTM again to a tidy token-dataframe.
text_dtm %>% tidy()
- We also can directly use a similar function to cast a sparse matrix (which we for sure then also could transform to a tibble again)
text_tidy %>% cast_sparse(row = id, column = word, value = n)
6 x 25 sparse Matrix of class "dgCMatrix"
1 1 1 1 1 . . . . . . . . . . . . . . . . . . . . .
2 1 1 . 1 1 . . . . . . . . . . . . . . . . . . . .
3 1 1 . 1 . 1 1 1 . . . . . . . . . . . . . . . . .
4 . 1 1 . 2 1 . . 2 1 1 . . . . . . . . . . . . . .
5 . 2 1 1 1 . . . . . . 1 1 1 1 1 1 1 . . . . . . .
6 . . 5 . 1 . . . . . . 1 . . . . . . 1 1 1 1 1 1 1
- Finally, we could just apply a text recipe here
library(recipes)
library(textrecipes)
text %>%
recipe(~.) %>%
step_tokenize(text, token = 'words') %>% # tokenize
step_tf(text) %>% # TFIDF weighting
prep() %>% juice()
TF-IDF - Term Frequency - Inverse Document Frequency
- A token is important for a document if it appears very often
- A token becomes less important for comparison across a corpus if it appears all over the place in the corpus
- Cat in a corpus of websites talking about cats is not that important
\[w_{i,j} = tf_{i,j}*log(\frac{N}{df_i})\]
- \(w_{i,j}\) = the TF-IDF score for a term i in a document j
- \(tf_{i,j}\) = number of occurrences of term i in document j
- \(N\) = number of documents in the corpus
- \(df_i\) = number of documents with term i
# TFIDF weights
text_tidy %<>%
bind_tf_idf(term = word,
document = id,
n = n)
- We obviously could also cast a tf_idf weighted dtm…
text_tidy %>%
select(id, word, tf_idf) %>%
pivot_wider(names_from = word, values_from = tf_idf, values_fill = 0)
- btw: this is equivalent to just running a textrecipe like that:
text %>%
recipe(~.) %>%
step_tokenize(text, token = 'words') %>% # tokenize
step_tfidf(text) %>% # TFIDF weighting
prep() %>% juice()
- A last reminder on the powerful
pairwise_xx() functions from the widyr package
- For instance, pairwise distances or similarities between documents based on their tf-idf vectors
library(widyr)
text_tidy %>% pairwise_dist(id, word, tf_idf, method = "manhattan") %>%
mutate(similarity = 1 - (distance / max(distance)) ) %>%
select(-distance) %>%
arrange(desc(similarity))
Dimensionality reduction techniques
rm(list=ls())
- Ok, let's first get some more interesting text data to work with
text %<>%
rename(id = X1) %>%
filter(language == 'en')
# preprocessing
text_tidy %<>%
#mutate(word = word %>% str_remove_all('[^[:alnum:]]')) %>% ## remove all special characters
filter(str_length(word) > 2 ) %>% # Remove words with less than 3 characters
group_by(word) %>%
filter(n() > 100) %>% # remove words occuring less than 100 times
ungroup() %>%
anti_join(stop_words, by = 'word') # remove stopwords
PCA
text_pca <- text_dtm %>%
column_to_rownames('id') %>%
prcomp(center = TRUE, scale. = TRUE)
text_pca
Standard deviations (1, .., p=6):
[1] 3.207823e+00 2.759080e+00 2.234460e+00 1.388305e+00 4.208882e-01 6.333504e-16
Rotation (n x k) = (25 x 6):
PC1 PC2 PC3 PC4 PC5 PC6
a -0.12298966 -0.20293146 0.3096529650 -0.16145250 -0.10294565 -0.242719650
about -0.27381282 0.16761993 -0.0523034736 0.01945303 0.03687218 0.621255802
cats 0.29282077 0.10623353 -0.0259909788 0.02013339 0.39484526 -0.139416189
text -0.22594973 0.04505067 0.2965162521 -0.09825013 -0.09569899 -0.609033136
dogs 0.05438080 0.04321488 -0.4058754433 -0.06481897 -0.83922505 -0.056366491
and -0.07300087 -0.21088577 -0.1718210259 0.48798584 0.03698842 -0.088662025
another -0.07475529 -0.14019613 0.1983802046 0.55517809 -0.11432042 0.019762232
dog -0.07475529 -0.14019613 0.1983802046 0.55517809 -0.11432042 0.019762232
always -0.01758432 -0.12655562 -0.4157185213 0.06208059 0.16110748 -0.177054662
why -0.01758432 -0.12655562 -0.4157185213 0.06208059 0.16110748 -0.177054662
writing -0.01758432 -0.12655562 -0.4157185213 0.06208059 0.16110748 -0.177054662
are 0.14435186 0.31529231 0.0002177812 0.12216722 -0.01817629 -0.173880335
but -0.12079837 0.32924621 -0.0403763595 0.09233359 0.01706538 -0.068982991
little -0.12079837 0.32924621 -0.0403763595 0.09233359 0.01706538 -0.068982991
many -0.12079837 0.32924621 -0.0403763595 0.09233359 0.01706538 -0.068982991
there -0.12079837 0.32924621 -0.0403763595 0.09233359 0.01706538 -0.068982991
to -0.12079837 0.32924621 -0.0403763595 0.09233359 0.01706538 -0.068982991
too -0.12079837 0.32924621 -0.0403763595 0.09233359 0.01706538 -0.068982991
better 0.30339063 0.06957053 0.0406518334 0.06219708 -0.04005677 0.004133281
i 0.30339063 0.06957053 0.0406518334 0.06219708 -0.04005677 0.004133281
love 0.30339063 0.06957053 0.0406518334 0.06219708 -0.04005677 0.004133281
much 0.30339063 0.06957053 0.0406518334 0.06219708 -0.04005677 0.004133281
soo 0.30339063 0.06957053 0.0406518334 0.06219708 -0.04005677 0.004133281
than 0.30339063 0.06957053 0.0406518334 0.06219708 -0.04005677 0.004133281
way 0.30339063 0.06957053 0.0406518334 0.06219708 -0.04005677 0.004133281
text_pca[['x']]
PC1 PC2 PC3 PC4 PC5 PC6
1 -0.9053472 -1.026254 1.3687094 -1.4697533 0.659153358 4.284767e-16
2 -0.9903765 -1.025903 0.8434948 -1.5666736 -0.667757859 1.538700e-15
3 -1.5702074 -2.178507 2.0217993 2.1842199 -0.041338206 -2.376571e-16
4 -0.3693521 -1.966548 -4.2368110 0.2442417 0.058256383 -5.152129e-16
5 -2.5373253 5.116156 -0.4114972 0.3632652 0.006170831 -1.451964e-15
6 6.3726085 1.081056 0.4143047 0.2447001 -0.014484506 -7.667478e-16
- Again, alternatively with a recipe…
text_pca <- text %>%
recipe(~.) %>%
update_role(id, new_role = "id") %>%
step_tokenize(text, token = 'words') %>% # tokenize
step_tfidf(text, prefix = NULL) %>% # TFIDF weighting
step_pca(all_predictors(), num_comp = 3) %>% # PCA
prep()
text_pca %>% juice()
text_pca %>%
tidy(3) %>%
filter(component %in% paste0("PC", 1:3)) %>%
mutate(component = fct_inorder(component)) %>%
ggplot(aes(value, terms, fill = terms)) +
geom_col(show.legend = FALSE) +
facet_wrap(~component, nrow = 1) +
labs(y = NULL)
library(embed)
text_UMAP <- text %>%
recipe(~.) %>%
update_role(id, new_role = "id") %>%
step_tokenize(text, token = 'words') %>% # tokenize
step_tfidf(text, prefix = NULL) %>% # TFIDF weighting
step_umap(all_predictors(), n_neighbors = 2) %>%
prep()
Error in uwot(X = X, n_neighbors = n_neighbors, n_components = n_components, :
n_neighbors must be smaller than the dataset size
Embeddings (Bonus)
glove6b <- embedding_glove42b(dimensions =100)
Error in embedding_glove42b(dimensions = 100) :
unused argument (dimensions = 100)
Summary
---
title: '(Somewhat) advanced NLP: text vectorization'
author: "Daniel S. Hain (dsh@business.aau.dk)"
date: "Updated `r format(Sys.time(), '%B %d, %Y')`"
output:
  html_notebook:
    code_folding: show
    df_print: paged
    toc: true
    toc_depth: 2
    toc_float:
      collapsed: false
    theme: flatly
---

```{r setup, include=FALSE}
### Generic preamble
# NOTE(review): dropped `rm(list = ls())` — knitting always starts from a fresh
# R session, and wiping the global environment is a script anti-pattern (it
# also deletes loaded helpers and settings when run interactively).
Sys.setenv(LANG = "en") # Force English locale for messages
options(scipen = 5) # Discourage scientific number notation in printed output

### Knitr options
library(knitr) # For display of the markdown
knitr::opts_chunk$set(warning = FALSE,
                      message = FALSE,
                      # NOTE(review): knitr documents a string or NA for `comment`
                      # (e.g. comment = "") — confirm FALSE behaves as intended.
                      comment = FALSE,
                      fig.align = "center"
                      )
```

```{r}
### Load standardpackages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 ect.
library(magrittr) # For extra-piping operators (eg. %<>%)
```

```{r}
library(tidytext)
```

# This session


# Refresher:

![](https://sds-aau.github.io/SDS-master/00_media/nlp_tidyworkflow.png)


# Bag of words model

* In order for a computer to understand text we need to somehow find a useful representation.
* If you need to compare different texts e.g. articles, you will probably go for keywords. These keywords may come from a keyword-list with for example 200 different keywords
* In that case you could represent each document with a (sparse) vector with 1 for "keyword present" and 0 for "keyword absent"
* We can also get a bit more sophisticated and count the number of times a word from our dictionary occurs.
* For a corpus of documents that would give us a document-term matrix.

![example](https://i.stack.imgur.com/C1UMs.png)

Let's try creating a bag of words model from our initial example.

```{r}
# Toy corpus: six short documents about cats and dogs, one row per document.
# (1:6 is already a vector — wrapping it in c() was redundant.)
text <- tibble(id = 1:6,
               text = c('A text about cats.',
                        'A text about dogs.',
                        'And another text about a dog.',
                        'Why always writing about cats and dogs, always dogs?',
                        'There are too little text about cats but to many about dogs',
                        'Cats, cats, cats! I love cats soo much. Cats are way better than dogs'))
```

```{r}
# Tidy one-token-per-row representation: tokenize each document into words,
# then count occurrences per document -> columns id, word, n.
text_tidy <- text %>% 
  unnest_tokens(word, text, token = 'words') %>% 
  count(id, word)
```


## The document-term matrix (DTM)

* The simplest form of vector representation of text is a document-term matrix
* How do we get a document-term matrix now?
* We could do it by hand, with well-known `dplyr` syntax (Note: only works when you have one row per unique document-word pair)

```{r}
# Hand-rolled document-term matrix: one row per document, one column per word,
# counts as values; words absent from a document become 0 via values_fill.
text_tidy %>%
  pivot_wider(names_from = word, values_from = n, values_fill = 0)
```

* We could also use `cast_dtm()` to create a DTM in the format of the `tm` package.

```{r}
# Cast the tidy counts into a tm-style DocumentTermMatrix
# (rows = documents, columns = terms, values = counts n).
text_dtm <- text_tidy %>%
  cast_dtm(id, word, n)
```

```{r}
# Printing a DocumentTermMatrix shows summary info (dimensions, sparsity, weighting).
text_dtm 
```

* We can simply convert it to a tibble. Since there exists no direct transfer function, we have to first transform it to a matrix.
* Notice how we recover the rownames

```{r}
# DTM -> base matrix -> tibble; rownames = 'id' recovers the document ids as a column.
text_dtm %>% as.matrix() %>% as_tibble(rownames = 'id') 
```

* Sidenote: We can also tidy the DTM again to a tidy token-dataframe.

```{r}
# tidytext::tidy() converts the DTM back to a tidy (document, term, count) dataframe.
text_dtm %>% tidy()
```
* We also can directly use a similar function to cast a sparse matrix (which we for sure then also could transform to a tibble again)

```{r}
# Cast directly to a sparse dgCMatrix (Matrix package) instead of a tm DTM.
text_tidy %>% cast_sparse(row = id, column = word, value = n)
```

* Finally, we could just apply a text recipe here

```{r}
library(recipes)
library(textrecipes)
```

```{r}
# Same bag-of-words DTM built via a textrecipes pipeline.
text %>%
  recipe(~.) %>% 
  step_tokenize(text, token = 'words') %>% # tokenize
  step_tf(text) %>% # term-frequency (raw count) weighting — step_tf() is NOT tf-idf
  prep() %>% juice()
```


## TF-IDF - Term Frequency - Inverse Document Frequency

* A token is important for a document if it appears very often
* A token becomes less important for comparison across a corpus if it appears all over the place in the corpus
* *Cat* in a corpus of websites talking about cats is not that important

$$w_{i,j} = tf_{i,j}*log(\frac{N}{df_i})$$

- $w_{i,j}$ = the TF-IDF score for a term i in a document j
- $tf_{i,j}$ = number of occurrences of term i in document j
- $N$ = number of documents in the corpus
- $df_i$ = number of documents with term i

```{r}
# TFIDF weights
# Adds tf, idf and tf_idf columns to the tidy counts
# (term = token column, document = document id, n = per-document count).
text_tidy %<>%
  bind_tf_idf(term = word,
              document = id,
              n = n)
```

* We obviously could also cast a tf_idf weighted dtm...

```{r}
# Wide document-term matrix weighted by tf-idf instead of raw counts.
text_tidy %>%
  select(id, word, tf_idf) %>%
  pivot_wider(names_from = word, values_from = tf_idf, values_fill = 0)
```

* btw: this is equivalent to just running a textrecipe like that:

```{r}
# Equivalent tf-idf-weighted DTM via textrecipes.
text %>%
  recipe(~.) %>% 
  step_tokenize(text, token = 'words') %>% # tokenize
  step_tfidf(text) %>% # TFIDF weighting
  prep() %>% juice()
```

* A last reminder on the powerful `pairwise_xx()` functions from the `widyr` package
* For instance, pairwise distances or similarities between documents based on their tf-idf vectors

```{r}
library(widyr)
```

```{r}
# Pairwise Manhattan distance between documents in tf-idf space,
# rescaled to a [0, 1] similarity (1 = most similar pair, 0 = most distant pair).
text_tidy %>% pairwise_dist(id, word, tf_idf, method = "manhattan") %>%
  mutate(similarity = 1 - (distance / max(distance)) ) %>%
  select(-distance) %>%
  arrange(desc(similarity))
```



# Dimensionality reduction techniques

```{r}
# Clear only the toy-example objects from the section above;
# `rm(list = ls())` is an anti-pattern — it wipes the whole environment,
# including loaded helpers and settings.
rm(text, text_tidy, text_dtm)
```

* Ok, let's first get some more interesting text data to work with

```{r}
# Load the CORDIS H2020 project-report corpus from GitHub (network required).
# NOTE(review): column names depend on the readr version — older read_csv()
# called the unnamed first column `X1`, readr >= 2.0 calls it `...1`; the
# rename() in the next chunk assumes `X1`. Confirm against the readr in use.
text <- read_csv('https://github.com/SDS-AAU/SDS-master/raw/master/M2/data/cordis-h2020reports.gz')
```

```{r}
# Use the (formerly unnamed) first column as document id and keep only
# English-language reports.
# NOTE(review): assumes read_csv() named the first column `X1` — with
# readr >= 2.0 it would be `...1`; verify before running.
text %<>%
  rename(id = X1) %>%
  filter(language == 'en')
```



```{r}
# preprocessing
text_tidy %<>%
  #mutate(word = word %>% str_remove_all('[^[:alnum:]]')) %>% ## remove all special characters
  filter(str_length(word) > 2 ) %>% # Remove words with less than  3 characters
  group_by(word) %>%
  filter(n() > 100) %>% # remove words occuring less than 100 times
  ungroup() %>%
  anti_join(stop_words, by = 'word') # remove stopwords
```






```{r, include=FALSE}
# Hidden helper chunk: build a wide document-term count table from the raw
# corpus. NOTE(review): this uses `text` directly, so — unlike `text_tidy`
# above — it keeps stopwords, rare words and short tokens; confirm that is
# intended before interpreting the PCA below.
text_dtm <- text %>%
  unnest_tokens(word, text, token = 'words') %>% 
  count(id, word) %>%
  pivot_wider(names_from = word, values_from = n, values_fill = 0)
```



## PCA

```{r}
# PCA on the standardized document-term counts (documents as rows).
# NOTE(review): prcomp(scale. = TRUE) errors on zero-variance columns —
# confirm no term has a constant count across all documents.
text_pca <- text_dtm %>% 
  column_to_rownames('id') %>% 
  prcomp(center = TRUE, scale. = TRUE)
```

```{r}
# Overview of the prcomp object: sdev, rotation (loadings), x (scores), etc.
text_pca %>% glimpse()
```

```{r}
# Document scores: coordinates of each document in principal-component space.
text_pca[['x']]
```

* Again, alternatively with a recipe...

```{r}
# Same idea as a recipes pipeline: tokenize -> tf-idf -> PCA (3 components).
text_pca <- text %>%
  recipe(~.) %>% 
  update_role(id, new_role = "id") %>%
  step_tokenize(text, token = 'words') %>% # tokenize
  step_tfidf(text, prefix = NULL) %>% # TFIDF weighting; prefix = NULL presumably drops the "tfidf_" column prefix — confirm against textrecipes docs
  step_pca(all_predictors(), num_comp = 3) %>% # PCA
  prep() 
```

```{r}
# Extract the processed data (PC scores per document) from the prepped recipe.
text_pca %>% juice()
```

```{r}
# Loadings plot: tidy(3) pulls the third step's (step_pca) term loadings,
# then plot each term's contribution to PC1-PC3, one facet per component.
text_pca %>%
  tidy(3) %>%
  filter(component %in% paste0("PC", 1:3)) %>%
  mutate(component = fct_inorder(component)) %>%
  ggplot(aes(value, terms, fill = terms)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~component, nrow = 1) +
  labs(y = NULL)
```


```{r}
library(embed)
```

```{r}
# UMAP embedding of the tf-idf vectors via embed::step_umap().
# NOTE(review): the rendered output above shows this chunk failing with
# "n_neighbors must be smaller than the dataset size" — n_neighbors = 2
# requires more rows than the data apparently had at run time; verify the
# corpus size before relying on this chunk.
text_UMAP <- text %>%
  recipe(~.) %>% 
  update_role(id, new_role = "id") %>%
  step_tokenize(text, token = 'words') %>% # tokenize
  step_tfidf(text, prefix = NULL) %>% # TFIDF weighting
  step_umap(all_predictors(), n_neighbors = 2) %>%
  prep() 
```







## Topic Models: LDA




```{r}
#UMAP
```



# Embeddings (Bonus)

```{r}
# Download pretrained GloVe word embeddings (27B-token Twitter set, 100 dims).
# NOTE(review): the object is named `glove6b` but embedding_glove27b() loads
# the 27B Twitter embeddings — the name is misleading. The rendered output
# above also shows an earlier attempt, embedding_glove42b(dimensions = 100),
# failing: per textdata, the 42B set ships only 300-dim vectors and takes no
# `dimensions` argument.
library(textdata)

glove6b <- embedding_glove27b(dimensions = 100)
glove6b
# These embeddings can now be loaded with step_wordembeddings
```




# Summary






